#!/usr/bin/perl -w

#=======================================================================
# find_inv_utf8
# File ID: d971a582-f742-11dd-9aeb-000475e441b9
# Read lines from stdin and print those that contain eight-bit 
# characters not encoded in UTF-8.
#
# Character set: UTF-8
# ©opyleft 2001– Øyvind A. Holm <sunny@sunbase.org>
# License: GNU General Public License version 2 or later, see end of 
# file for legal stuff.
#=======================================================================

use strict;
use Getopt::Long;

# Turn on autoflush for the selected output handle, so every printed 
# line is written immediately.
$| = 1;

our $Debug = 0;    # Set to 1 by --debug, read by D()
my $Retval = 0;    # Exit value of the program

# Default value of every command line option.
our %Opt = (

    'debug'        => 0,
    'help'         => 0,
    'quiet'        => 0,
    'skip-invalid' => 0,
    'verbose'      => 0,
    'version'      => 0,
    'zero'         => 0,

);

# Program name with any leading directory part removed.
our $progname = $0;
$progname =~ s/^.*\/(.*?)$/$1/;
our $VERSION = "0.00";

# "bundling" lets single-letter options be combined, e.g. -qs.
Getopt::Long::Configure("bundling");
GetOptions(

    "debug"          => \$Opt{'debug'},
    "help|h"         => \$Opt{'help'},
    "quiet|q"        => \$Opt{'quiet'},
    "skip-invalid|s" => \$Opt{'skip-invalid'},
    "verbose|v+"     => \$Opt{'verbose'},
    "version"        => \$Opt{'version'},
    "zero|z"         => \$Opt{'zero'},

) or die("$progname: Option error. Use -h for help.\n");

if ($Opt{'debug'}) {
    $Debug = 1;
}

# usage() prints the help text and exits.
if ($Opt{'help'}) {
    usage(0);
}

if ($Opt{'version'}) {
    print_version();
    exit(0);
}

# With -z, input records are terminated by NUL instead of newline.
if ($Opt{'zero'}) {
    $/ = "\x00";
}

# The read loop is duplicated, one copy per mode, to gain speed: the 
# mode is tested once instead of for every record.

if ($Opt{'skip-invalid'}) {
    # Print only the records that are valid UTF-8.
    while (my $record = <>) {
        if (is_utf8($record)) {
            print $record unless $Opt{'quiet'};
        } else {
            $Retval = 1;
        }
    }
} else {
    # Print only the records that are not valid UTF-8.
    while (my $record = <>) {
        next if is_utf8($record);
        $Retval = 1;
        # In quiet mode the first invalid record settles the exit 
        # value, so stop reading.
        last if $Opt{'quiet'};
        print $record;
    }
}

exit($Retval);

sub is_utf8 {
    # Check whether a byte string is well-formed UTF-8 {{{
    #
    # Takes one argument, the string to check (a line or record as read 
    # from the input, including its terminator). Returns 1 if the whole 
    # string is a sequence of valid UTF-8 byte sequences, otherwise 0. 
    # An empty string is valid.
    my $text = shift;
    # UTF-8 regexp copied from linux-2.6.git/scripts/checkpatch.pl in 
    # commit ddb503b42960792f3be580f98959add669241a04. Originally from 
    # http://www.w3.org/International/questions/qa-forms-utf-8.en.php .
    # Modified by me to include U+007F and everything below U+0020, not 
    # only \t, \n and \r.

    my $UTF8 = qr {
        [\x00-\x7F]                          # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
        |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
    }x;

    # Consume the string in bounded chunks, each anchored with \G at the 
    # position where the previous chunk ended. A single unbounded 
    # "$UTF8*" over the whole string can reach the regexp engine's limit 
    # on the number of iterations of a complex quantified group (see 
    # "Complex regular subexpression recursion limit" in perldiag), 
    # which makes long but valid input fail to match.
    pos($text) = 0;
    1 while ($text =~ m/\G(?:$UTF8){1,1000}/gcx);

    # Because of /c a failed match leaves pos() where the last 
    # successful chunk ended. The string is valid only if that is the 
    # end of the string.
    my $consumed = pos($text);
    defined($consumed) || ($consumed = 0);
    my $retval = ($consumed == length($text)) ? 1 : 0;
    return($retval);
    # }}}
} # is_utf8()

sub print_version {
    # Write the program name and version number to stdout {{{
    my $version_line = "$progname v$VERSION\n";
    print($version_line);
    # }}}
} # print_version()

sub usage {
    # Send the help message to stdout and terminate the program {{{
    #
    # Takes one argument, the exit value to terminate with. If the 
    # verbose level is above zero, the version line is printed before 
    # the help text. This function does not return.
    my $Retval = shift;

    if ($Opt{'verbose'}) {
        print("\n");
        print_version();
    }
    print(<<END);

Usage: $progname [options] [file [files [...]]]

Print lines containing invalid UTF-8. Returns 1 if invalid UTF-8 is 
found, otherwise 0 is returned.

Options:

  -h, --help
    Show this help.
  -q, --quiet
    Don't produce any output, use only return value.
  -s, --skip-invalid
    Vice versa, skip lines with invalid UTF-8 and only print those 
    containing proper UTF-8.
  -v, --verbose
    Increase level of verbosity. Can be repeated.
  -z, --zero
    Use NUL character (0x00) as separator instead of \\n. This affects 
    both input and output.
  --version
    Print version information.
  --debug
    Print debugging messages.

END
    exit($Retval);
    # }}}
} # usage()

sub msg {
    # Write a status message to stderr if the current verbosity level is 
    # high enough {{{
    #
    # Arguments: the lowest verbose level at which the message is shown, 
    # and the message text. The program name is prepended and a newline 
    # appended.
    my ($min_level, $message) = @_;

    ($Opt{'verbose'} >= $min_level)
        && print(STDERR "$progname: $message\n");
    # }}}
} # msg()

sub D {
    # Write a debugging message to stderr when debugging is enabled {{{
    #
    # Takes the message text as its only argument. The message is 
    # prefixed with the file name (without directory) and line number 
    # of the caller and the process ID. Returns an empty string after 
    # printing, or nothing if debugging is off.
    return unless $Debug;
    my (undef, $caller_file, $caller_line) = caller;
    my $message = shift;
    chomp($message);
    # Normalise backslashes to slashes, then strip the directory part.
    $caller_file =~ s#\\#/#g;
    $caller_file =~ s#^.*/(.*?)$#$1#;
    print(STDERR "$caller_file:$caller_line $$ $message\n");
    return("");
    # }}}
} # D()

__END__

# Plain Old Documentation (POD) {{{

=pod

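=head1 NAME

find_inv_utf8 - print lines containing invalid UTF-8

=head1 SYNOPSIS

 find_inv_utf8 [options] [file [files [...]]]

=head1 DESCRIPTION

Read lines from the specified files, or from standard input if no files 
are given, and print those that contain bytes which are not valid UTF-8. 
The exit value is 1 if invalid UTF-8 was found, otherwise 0.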

=head1 OPTIONS

=over 4

=item B<-h>, B<--help>

Print a brief help summary.

=item B<-v>, B<--verbose>

Increase level of verbosity. Can be repeated.

=item B<--version>

Print version information.

=item B<--debug>

Print debugging messages.

=back

=head1 BUGS



=head1 AUTHOR

Made by Øyvind A. Holm S<E<lt>sunny@sunbase.orgE<gt>>.

=head1 COPYRIGHT

Copyleft © Øyvind A. Holm E<lt>sunny@sunbase.orgE<gt>
This is free software; see the file F<COPYING> for legalese stuff.

=head1 LICENCE

This program is free software: you can redistribute it and/or modify it 
under the terms of the GNU General Public License as published by the 
Free Software Foundation, either version 2 of the License, or (at your 
option) any later version.

This program is distributed in the hope that it will be useful, but 
WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along 
with this program.
If not, see L<http://www.gnu.org/licenses/>.

=head1 SEE ALSO

=cut

# }}}

# vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :
